Objectives

Objective of the study are:

Key Questions

  1. What are the top categories and sub-categories in terms of sales and profit?

  2. Are there any specific products that contribute significantly to profit?

  3. What is the distribution of sales, quantity, discount, and profit?

  4. How are sales and profit distributed across different regions, categories, and sub-categories?

  5. Which region has the highest sales?

  6. How do profit and sales vary across different regions?

  7. How does the average sales vary across different ship modes?

  8. Are there any products that can consistently contribute to high profits?

  9. How does the application of discounts impact the sales performance of different products or categories?

  10. How does the profit margin vary across different products?

Project Analysis

1. Summary

summary_df <- summary(Sample_EU_Superstore[c("Sales", "Quantity", "Discount", "Profit")])
summary_df <- as.data.frame(as.table(summary_df))
colnames(summary_df) <- c("Variable", "Statistic", "Value")

# Replace 'Variable' column values with the actual variable names
summary_df$Variable <- c("Sales", "Quantity", "Profit", "Discount")

# Assuming your summary output looks something like this
summary_df <- summary(Sample_EU_Superstore[c("Sales", "Quantity", "Discount", "Profit")])
summary_df <- as.data.frame(as.table(summary_df))
colnames(summary_df) <- c("Variable", "Statistic", "Value")

# Replace 'Variable' column values with the actual variable names
summary_df$Variable <- c("Sales", "Quantity", "Profit", "Discount")

2. Total Sales Across Regions

# Region-wise analysis
region_sales <- Sample_EU_Superstore %>% 
  group_by(Region) %>% 
  summarise(total_sales = sum(Sales))
as.data.frame(print(region_sales))
## # A tibble: 3 × 2
##   Region  total_sales
##   <chr>         <dbl>
## 1 Central    1720553.
## 2 North       625575.
## 3 South       591961.
##    Region total_sales
## 1 Central   1720552.6
## 2   North    625575.0
## 3   South    591961.5
# Bar plot for sales across regions with Plotly
region_sales_plot<- ggplot(region_sales, aes(x = Region, y = total_sales, fill = Region)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Sales Across Regions", x = "Region", y = "Total Sales") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to plotly
region_plot<-ggplotly(region_sales_plot)
region_plot

3. Proportions for Region Sales

#Proportions
region_sales_prop <- Sample_EU_Superstore %>% 
  group_by(Region) %>% 
  summarise(total_sales = sum(Sales))
# Calculate the percentage contribution
region_sales_prop$Percentage <- (region_sales$total_sales / sum(region_sales$total_sales)) * 100
# Print the results
print(region_sales_prop)
## # A tibble: 3 × 3
##   Region  total_sales Percentage
##   <chr>         <dbl>      <dbl>
## 1 Central    1720553.       58.6
## 2 North       625575.       21.3
## 3 South       591961.       20.1
# Stacked bar plot for percentage contribution of sales by region with Plotly
region_plot <- ggplotly(
  ggplot(region_sales_prop, aes(x = Region, y = Percentage, fill = Region)) +
    geom_bar(stat = "identity", position = "stack") +
    labs(title = "Percentage Contribution of Sales by Region",
         x = "Region",
         y = "Percentage (%)") +
    theme_minimal()
)
region_plot

4. Category Wise Analysis

# Category-wise analysis
category_sales <- Sample_EU_Superstore %>% 
  group_by(Category) %>% 
  summarise(total_sales = sum(Sales)) 
  print(category_sales)
## # A tibble: 3 × 2
##   Category        total_sales
##   <chr>                 <dbl>
## 1 Furniture           779103.
## 2 Office Supplies    1046307.
## 3 Technology         1112679.
# Convert total_sales to decimal format
category_sales$total_sales <- format(category_sales$total_sales, scientific = FALSE)
# Bar plot for sales across categories
category_sales_plot <- ggplot(category_sales, aes(x = Category, y = total_sales, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Sales Across Categories", x = "Category", y = "Total Sales") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to plotly
category_sales_plotly <- ggplotly(category_sales_plot)
# Display the plotly object
category_sales_plotly

5. Sub_Category Analysis

## Sub-category-wise analysis
sub_category_profit <- Sample_EU_Superstore %>% 
  group_by(`Sub-Category`) %>% 
  summarise(total_profit = sum(Profit))
print(sub_category_profit)
## # A tibble: 17 × 2
##    `Sub-Category` total_profit
##    <chr>                 <dbl>
##  1 Accessories          33442.
##  2 Appliances           46337.
##  3 Art                  30794.
##  4 Binders              19760.
##  5 Bookcases            56407.
##  6 Chairs               19792.
##  7 Copiers              56200.
##  8 Envelopes             8640.
##  9 Fasteners             4428.
## 10 Furnishings          13650.
## 11 Labels                3289.
## 12 Machines             17287.
## 13 Paper                 8884.
## 14 Phones               37344.
## 15 Storage              27950.
## 16 Supplies              9625.
## 17 Tables              -20998.
sub_category_profit_plot <- ggplot(sub_category_profit, aes(x = `Sub-Category`, y = total_profit, fill = `Sub-Category`)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Profit Across Sub-categories", x = "Sub-category", y = "Total Profit") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
sub_category_profit_plotly <- ggplotly(sub_category_profit_plot)
sub_category_profit_plotly

6. Quarterly Sales Analysis

# Convert Order Date to Date format
Sample_EU_Superstore$`Order Date` <- as.Date(Sample_EU_Superstore$`Order Date`)

# Extract quarter from Order Date
Sample_EU_Superstore$Quarter <- quarter(Sample_EU_Superstore$`Order Date`, with_year = TRUE)

# Aggregate Quarterly Sales by Segments
quarterly_sales <- Sample_EU_Superstore %>%
  group_by(Segment, Quarter) %>%
  summarise(total_sales = sum(Sales))
## `summarise()` has grouped output by 'Segment'. You can override using the
## `.groups` argument.
print(quarterly_sales)
## # A tibble: 48 × 3
## # Groups:   Segment [3]
##    Segment  Quarter total_sales
##    <chr>      <dbl>       <dbl>
##  1 Consumer   2015.      24033.
##  2 Consumer   2015.      55360.
##  3 Consumer   2015.      92606.
##  4 Consumer   2015.      84076.
##  5 Consumer   2016.      64388.
##  6 Consumer   2016.      87087.
##  7 Consumer   2016.     125014.
##  8 Consumer   2016.      98498.
##  9 Consumer   2017.      74263.
## 10 Consumer   2017.      94378.
## # ℹ 38 more rows
# Plot Quarterly Sales by Segments
quarterly_sales_plot <- ggplot(quarterly_sales, aes(x = Quarter, y = total_sales, color = Segment)) +
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  labs(title = "Quarterly Sales by Segments",
       x = "Quarter",
       y = "Total Sales",
       color = "Segments") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the interactive plot using plotly
ggplotly(quarterly_sales_plot)

7. Yearly Sales Forecast

# Convert Order Date to Date format
Sample_EU_Superstore$`Order Date` <- as.Date(Sample_EU_Superstore$`Order Date`)
# Define a function to compute the desired metric (e.g., rolling average) within the rolling window
rolling_window_function <- function(data) {
  # Replace 'Sales' with the desired metric (Sales, Profit, Quantity, Discount)
  result <- mean(data, na.rm = TRUE)
  return(result)
}
# Set the size of the rolling window (e.g., 3 months)
window_size <- 3
# Apply rolling window analysis to the entire dataset
rolling_window_results <- Sample_EU_Superstore %>%
  arrange(`Order Date`) %>%
  group_by(`Order Date`) %>%
  summarise(Sales = sum(Sales)) %>%
  mutate(rolling_sales = zoo::rollapply(Sales, window_size, rolling_window_function, align = "right", fill = NA, partial = TRUE))
print(rolling_window_results) # Display the results
## # A tibble: 1,253 × 3
##    `Order Date`  Sales rolling_sales
##    <date>        <dbl>         <dbl>
##  1 2015-01-01     44.9          44.9
##  2 2015-01-03    854.          450. 
##  3 2015-01-04    183.          361. 
##  4 2015-01-05    297.          445. 
##  5 2015-01-06    324.          268. 
##  6 2015-01-07    910.          510. 
##  7 2015-01-08   1103.          779. 
##  8 2015-01-10    206.          739. 
##  9 2015-01-11   1969.         1093. 
## 10 2015-01-12   1671.         1282. 
## # ℹ 1,243 more rows
print(rolling_window_results)
## # A tibble: 1,253 × 3
##    `Order Date`  Sales rolling_sales
##    <date>        <dbl>         <dbl>
##  1 2015-01-01     44.9          44.9
##  2 2015-01-03    854.          450. 
##  3 2015-01-04    183.          361. 
##  4 2015-01-05    297.          445. 
##  5 2015-01-06    324.          268. 
##  6 2015-01-07    910.          510. 
##  7 2015-01-08   1103.          779. 
##  8 2015-01-10    206.          739. 
##  9 2015-01-11   1969.         1093. 
## 10 2015-01-12   1671.         1282. 
## # ℹ 1,243 more rows
# Line chart for rolling sales with enhanced readability
plot <- ggplot(rolling_window_results, aes(x = `Order Date`, y = rolling_sales)) +
  geom_line(color = "lightblue", linewidth = 1, linetype = "solid") +
  geom_smooth(method = "loess", se = FALSE, color = "green", linetype = "dashed") +
  labs(title = "Rolling Window Analysis - Sales Over Time",
       x = "Order Date",
       y = "Rolling Sales",
       caption = "Dashed line represents a smoothed trend") +
  theme_minimal() +
  theme(legend.position = "none") +
  annotate("text", x = max(rolling_window_results$`Order Date`), y = max(rolling_window_results$rolling_sales), 
           label = "Smoothed Trend", hjust = 1.2, vjust = 0.5, color = "orange") +
  geom_point(size = 2, color = "pink") 
rolling_window_plot <- ggplotly(plot); rolling_window_plot
## `geom_smooth()` using formula = 'y ~ x'

8. Yearly Sales and Profit

# Convert 'Order Date' to a Date type
Sample_EU_Superstore$`Order Date` <- as.Date(Sample_EU_Superstore$`Order Date`)

# Extract year from 'Order Date'
Sample_EU_Superstore <- Sample_EU_Superstore %>%
  mutate(Year = year(`Order Date`))

# Group by year and calculate total sales and profit
yearly_summary <- Sample_EU_Superstore %>%
  group_by(Year) %>%
  summarise(TotalSales = sum(Sales),
            TotalProfit = sum(Profit))
print(yearly_summary)
## # A tibble: 4 × 3
##    Year TotalSales TotalProfit
##   <dbl>      <dbl>       <dbl>
## 1  2015    477797.      61376.
## 2  2016    652647.      84025.
## 3  2017    765441.      98484.
## 4  2018   1042204.     128944.
# Visualize the results with a line chart
ggplotly(ggplot(yearly_summary, aes(x = factor(Year), y = TotalSales, group = 1)) +
  geom_line(aes(color = "Total Sales"), size = 1) +
  geom_line(aes(y = TotalProfit, color = "Total Profit"), size = 1) +
  ggtitle("Yearly Sales and Profit") +
  xlab("Year") +
  ylab("Amount") +
  scale_color_manual(values = c("Total Sales" = "#4e79a7", "Total Profit" = "#e15759")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white"),
    legend.position = "top"
  )
)

9. TOP 10 Products by Total Profit

###Product Analysis
# Group by product and calculate total sales, profit, and profit margin
product_summary <- Sample_EU_Superstore %>%
  group_by(`Product ID`, `Product Name`) %>%
  summarise(TotalSales = sum(Sales),
            TotalProfit = sum(Profit),
            ProfitMargin = sum(Profit) / sum(Sales) * 100) %>%
  arrange(desc(TotalProfit))
## `summarise()` has grouped output by 'Product ID'. You can override using the
## `.groups` argument.
# Select the top N products (adjust N as needed)
top_products <- head(product_summary, 10)

# Visualize the results with rotated x-axis labels
ggplotly(ggplot(top_products, aes(x = reorder(`Product Name`, -TotalProfit), y = TotalProfit)) +
  geom_bar(stat = "identity",fill = "yellow") +
  ggtitle("Top 10 Products by Total Profit") +
  xlab("Product Name") +
  ylab("Total Profit") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate labels for better readability
)

10. Average Sales with Different Discounts

#Discount Analysis
# Assuming you have a column 'Discount' in your dataset
discount_analysis <- Sample_EU_Superstore %>%
  group_by(Category, `Product Name`, Discount) %>%
  summarise(AvgSales = mean(Sales))
## `summarise()` has grouped output by 'Category', 'Product Name'. You can
## override using the `.groups` argument.
discount_analysis
## # A tibble: 3,819 × 4
## # Groups:   Category, Product Name [1,856]
##    Category  `Product Name`            Discount AvgSales
##    <chr>     <chr>                        <dbl>    <dbl>
##  1 Furniture Advantus Clock, Black          0      132. 
##  2 Furniture Advantus Clock, Duo Pack       0      175. 
##  3 Furniture Advantus Clock, Duo Pack       0.3    144. 
##  4 Furniture Advantus Clock, Durable        0      242. 
##  5 Furniture Advantus Clock, Durable        0.1    218. 
##  6 Furniture Advantus Clock, Durable        0.3     67.8
##  7 Furniture Advantus Clock, Erganomic      0      100. 
##  8 Furniture Advantus Clock, Erganomic      0.3    140. 
##  9 Furniture Advantus Door Stop, Black      0      158. 
## 10 Furniture Advantus Door Stop, Black      0.4     27.0
## # ℹ 3,809 more rows
# Visualize the results

plot <- ggplot(discount_analysis, aes(x = Discount, y = AvgSales, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Average Sales with Different Discounts") +
  xlab("Discount") +
  ylab("Average Sales") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Use ggplotly to convert to interactive plotly plot
plotly_plot <- ggplotly(plot)
plotly_plot

11. Top Profitable Products

#product analysis
# Calculate total profit for each product
product_profit <- Sample_EU_Superstore %>%
  group_by(`Product ID`, `Product Name`) %>%
  summarize(TotalProfit = sum(Profit))
## `summarise()` has grouped output by 'Product ID'. You can override using the
## `.groups` argument.
# Identify top products with consistently high profits
top_profitable_products <- product_profit %>%
  filter(TotalProfit > quantile(TotalProfit, 0.75))  # Adjust the quantile threshold as needed
top_profitable_products
## # A tibble: 23 × 3
## # Groups:   Product ID [23]
##    `Product ID`    `Product Name`                       TotalProfit
##    <chr>           <chr>                                      <dbl>
##  1 FUR-CH-10002212 Hon Chairmat, Black                        -52.7
##  2 FUR-CH-10003249 Novimex Chairmat, Adjustable               316. 
##  3 FUR-FU-10000295 Advantus Light Bulb, Durable                57.1
##  4 FUR-FU-10000361 Eldon Stacking Tray, Duo Pack               24.6
##  5 FUR-FU-10001252 Eldon Frame, Duo Pack                      419. 
##  6 OFF-AP-10003577 Hoover Stove, Silver                       364. 
##  7 OFF-AP-10003758 KitchenAid Blender, Silver                 326. 
##  8 OFF-AR-10003651 Sanford Pencil Sharpener, Easy-Erase      1028. 
##  9 OFF-BI-10002570 Acco Binder Covers, Clear                  243. 
## 10 OFF-BI-10003708 Avery Binder, Economy                      162. 
## # ℹ 13 more rows
# Visualize top profitable products
ggplotly(ggplot(top_profitable_products, aes(x = reorder(`Product Name`, -TotalProfit), y = TotalProfit)) +
  geom_bar(stat = "identity", fill = "pink") +
  labs(title = "Top Profitable Products",
       x = "Product Name",
       y = "Total Profit") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

12. Average Sales Across Different Ship Modes

###ship mode analysis: 
# Group the data by ship mode and calculate the average sales for each mode
ship_mode_sales <- Sample_EU_Superstore %>%
  group_by(`Ship Mode`) %>%
  summarise(AverageSales = mean(Sales, na.rm = TRUE))
# Print the result
print(ship_mode_sales)
## # A tibble: 4 × 2
##   `Ship Mode`    AverageSales
##   <chr>                 <dbl>
## 1 First Class            284.
## 2 Same Day               319.
## 3 Second Class           304.
## 4 Standard Class         291.
# Create a more attractive bar plot
ggplotly(ggplot(ship_mode_sales, aes(x = reorder(`Ship Mode`, -AverageSales), y = AverageSales, fill = `Ship Mode`)) +
  geom_bar(stat = "identity", size = 0.7) +
  labs(title = "Average Sales Across Different Ship Modes",
       x = "Ship Mode",
       y = "Average Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
  scale_fill_brewer(palette = "Set3")  # You can choose a different color palette
)